# -*- coding: utf-8 -*-
import scrapy
import time
import json
import re
from ..items import ToutiaospiderItem


class ToutiaoSpider(scrapy.Spider):
    """Crawl Toutiao search results for the keyword 科技 ("technology").

    Flow: paginated search-API requests -> official-news article pages
    (``parse_page``) -> the article author's profile page (``parse_user``),
    where the finished :class:`ToutiaospiderItem` is yielded.  Article pages
    also feed recommended-article links back into ``parse_page``.
    """
    name = 'toutiao'
    allowed_domains = ['toutiao.com']

    def start_requests(self):
        """Yield the 11 paginated search-API requests (20 results per page)."""
        for page in range(11):
            offset = page * 20
            timestamp = int(time.time())
            ajax_url = "https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset={0}&format=json&keyword=%E7%A7%91%E6%8A%80&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis&timestamp={1}".format(offset, timestamp)
            yield scrapy.Request(url=ajax_url, callback=self.parse)

    def parse(self, response):
        """Parse the search-API JSON and request each official-news article.

        Only articles whose share URL points at Toutiao's own ``/group/``
        pages are followed; results from other channels are skipped.
        """
        payload = json.loads(response.body.decode())
        url_pattern = 'http://toutiao.com/group/'  # official Toutiao news only
        # "data" can be missing/None, and entries can lack "share_url";
        # skip the bad ones instead of swallowing every exception.
        for entry in payload.get("data") or []:
            try:
                page_url = entry["share_url"]
            except (KeyError, TypeError):
                continue
            if url_pattern in page_url:
                yield scrapy.Request(url=page_url, callback=self.parse_page)

    def parse_page(self, response):
        """Scrape one article page and request the author's profile page.

        Article metadata is extracted with regexes against the inline page
        state, passed along via ``response.meta`` to ``parse_user``.  Pages
        without a ``mediaInfo`` (author) block are dropped.  Recommended
        article links found on the page are fed back into this callback.
        """
        html = response.body.decode('utf-8')
        # Article-page fingerprint: has a header and tag block but no video
        # player — filters out Q&A and video pages.
        if 'headerInfo' in html and 'playerInfo' not in html and 'tagInfo' in html:
            crawl_time = time.ctime()
            # NOTE: the literal whitespace inside these patterns matches the
            # page's inline-JS formatting — do not reindent them.
            pattern_keyword = '''headerInfo: {
                          id: .*,
                          isPgc: .*,
                          userName: '.*',
                          avatarUrl: '.*',
                          isHomePage: .*,
                          chineseTag: '(.*?)',
                          crumbTag: '.*',
                          hasBar: .*
                        }'''
            pattern_article = '''articleInfo: {
      title: '(.*?)',
      content: '(.*?)',
      groupId: '.*',
      itemId: '.*',
      type: .*,
      subInfo: {
        isOriginal: .*,
        source: '(.*?)',
        time: '(.*?)'
      '''
            pattern_commond = '''commentInfo: {
      groupId: '.*',
      itemId: '.*',
      comments_count: (.*?),
      ban_comment: 0
    }'''
            pattern_tag = '''tagInfo: {
        tags: (.*?),
        groupId: '.*',
        itemId: '.*',
        repin: .*,
      },
      has_extern_link: .*,
      coverImg: '.*'
    }'''
            pattern_writer = '''mediaInfo: {
      uid: '(.*?)',
      name: '.*',
      avatar: '.*',
      openUrl: '(.*?)',
      follow: .*
    }'''

            def _first(pattern, default=""):
                # First match of *pattern* in the page, or *default* if absent.
                found = re.findall(pattern, html)
                return found[0] if found else default

            keyword = _first(pattern_keyword)                      # article category tag
            article_url = response.url                             # article URL
            # One scan of the page yields (title, content, source, time).
            title, raw_content, writer, datetime = _first(
                pattern_article, ("", "", "", ""))
            # Undo the HTML-entity escaping used in the inline page state.
            content = (raw_content.replace('&lt;', '<').replace('&gt;', '>')
                       .replace('&quot;', '').replace('&#x3D;', '='))
            comment_count = _first(pattern_commond)
            article_type = _first(pattern_tag)
            author = re.findall(pattern_writer, html)
            if author:  # no author block => article dropped (matches prior behavior)
                user_id = author[0][0]
                user_url = "https://www.toutiao.com/c/user/{0}/".format(user_id)
                meta = {
                    'crawl_time': crawl_time,
                    'keyword': keyword,
                    'article_url': article_url,
                    'title': title,
                    'content': content,
                    'datetime': datetime,
                    'writer': writer,
                    'comment_count': comment_count,
                    'article_type': article_type,
                    'user_id': user_id,
                }
                yield scrapy.Request(url=user_url, meta=meta,
                                     callback=self.parse_user)

        # Follow recommended-article links found anywhere on the page; they
        # loop back into this callback (Scrapy's dupefilter breaks cycles).
        pattern_url = r'https://www.toutiao.com/group/\d+/'
        for link in re.findall(pattern_url, html):
            yield scrapy.Request(url=link, callback=self.parse_page)

    def parse_user(self, response):
        """Read follow/fan counts off the author's profile page, yield the item.

        All article fields travel in ``response.meta``; the counts default to
        empty strings when the profile markup does not match.
        """
        html = response.body.decode('utf-8')
        item = ToutiaospiderItem()
        pattern = "guanzhu:'(.*?)',\n      fensi:'(\d+)"
        counts = re.findall(pattern, html)
        if counts:
            item["guanzhu"], item["fensi"] = counts[0]  # follows / fans
        else:
            item["guanzhu"] = ""
            item["fensi"] = ""
        # Copy the article fields gathered in parse_page straight through.
        for key in ("crawl_time", "keyword", "article_url", "title", "content",
                    "datetime", "writer", "comment_count", "article_type",
                    "user_id"):
            item[key] = response.meta[key]
        yield item














